#!/bin/bash
#============================================================================
# ${XEN_SCRIPT_DIR}/remus-netbuf-setup
#
# Script for attaching a network buffer to the specified vif (in any mode).
# The hotplugging system will call this script when starting remus via libxl
# API, libxl_domain_remus_start.
#
# Usage:
# remus-netbuf-setup (setup|teardown)
#
# Environment vars:
# vifname     vif interface name (required).
# XENBUS_PATH path in Xenstore, where the REMUS_IFB device details will be
#             stored or read from (required).
#             (libxl passes /libxl/<domid>/remus/netbuf/<devid>)
# REMUS_IFB   ifb interface to be cleaned up (required). [for teardown op only]

# Written to the store: (setup operation)
# XENBUS_PATH/ifb=<ifbdevName> the REMUS_IFB device serving
#  as the intermediate buffer through which the interface's network output
#  can be controlled.
#

# Remus network buffering requirements:

# We need to buffer (queue) egress traffic from every vif attached to
# the guest and release the buffers when the checkpoint associated
# with them has been committed at the backup host. We achieve this
# with the help of the plug queuing discipline (sch_plug module).
# Simply put, Remus' network buffering imposes traffic
# shaping on the guest's vif(s).

# Limitations and Workarounds:

# Egress traffic from a vif appears as ingress traffic to dom0. Linux
# supports policing (dropping packets) but not traffic shaping
# (queuing packets) on ingress traffic. The standard workaround to
# this limitation is to attach an ingress qdisc to the guest vif,
# redirect all egress traffic from the guest to an intermediate
# queuing interface, and apply egress rules to it. The IFB
# (Intermediate Functional Block) device serves the purpose of an
# intermediate queuing interface.
#

# The following commands install a network buffer on a
# guest's vif (vif1.0) using an IFB device (ifb0):
#
#  ip link set dev ifb0 up
#  tc qdisc add dev vif1.0 ingress
#  tc filter add dev vif1.0 parent ffff: proto ip \
#    prio 10 u32 match u32 0 0 action mirred egress redirect dev ifb0
#  nl-qdisc-add --dev=ifb0 --parent root plug
#  nl-qdisc-add --dev=ifb0 --parent root --update plug --limit=10000000
#                                                (10MB limit on buffer)
#
# So the order of operations when installing a network buffer on vif1.0 is:
# 1. find a free ifb and bring up the device
# 2. redirect traffic from vif1.0 to ifb:
#   2.1 add ingress qdisc to vif1.0 (to capture outgoing packets from guest)
#   2.2 use tc filter command with actions mirred egress + redirect
# 3. install plug_qdisc on ifb device, with which we can buffer/release
#    guest's network output from vif1.0
#
# Note:
# 1. If the setup process fails, the script's cleanup is limited to removing the
#    ingress qdisc on the guest vif, so that its traffic can flow normally.
#    The chosen ifb device is not torn down. Libxl has to execute the
#    teardown op to remove other qdiscs and subsequently free the IFB device.
#
# 2. The teardown op may be invoked multiple times by libxl.

#============================================================================

# Unlike the other vif scripts, this one does not source vif-common: that
# file executes vif-specific setup code (such as renaming) which is not
# needed here.
dir=$(dirname "$0")
. "$dir/xen-hotplug-common.sh"

findCommand "$@"

# Accept only the two documented operations. A case statement is used
# instead of `[ ... -a ... ]`: the -a form is obsolescent and is parsed
# ambiguously when $command happens to look like a test operator ("!", "("),
# which could let an invalid command slip through.
case "$command" in
  setup|teardown)
    ;;
  *)
    echo "Invalid command: $command"
    log err "Invalid command: $command"
    exit 1
    ;;
esac

evalVariables "$@"

# Both variables are mandatory: the expansion aborts the script when either
# one is unset or empty.
: "${vifname:?}"
: "${XENBUS_PATH:?}"

# Make sure every libnl command-line helper this script relies on can be
# resolved by the shell. Each missing helper is reported through fatal.
check_libnl_tools() {
    local tool

    for tool in nl-qdisc-list nl-qdisc-add nl-qdisc-delete
    do
        command -v "$tool" > /dev/null 2>&1 ||
            fatal "Unable to find $tool tool"
    done
}

# Availability check only -- no module is loaded here.
# The user/admin is expected to load ifb at boot time and to make sure
# enough free ifb devices exist in the system. The other modules are
# loaded automatically by the tc commands.
check_modules() {
    local module

    for module in ifb sch_plug sch_ingress act_mirred cls_u32
    do
        modinfo "$module" > /dev/null 2>&1 ||
            fatal "Unable to find $module kernel module"
    done
}

# Decide whether the ifb device named by $1 can be claimed.
# Returns 0 when the device is free and 1 when it is in use, i.e. when
# nl-qdisc-list prints anything for it or when the
# /libxl/<domid>/remus/netbuf/<devid>/ifb xenstore node of some domain
# already names it.
check_ifb() {
    local candidate=$1
    # The exit status of nl-qdisc-list is not checked; only its output is
    # examined.
    local qdiscs=$(nl-qdisc-list -d "$candidate")
    local dom dev node owner

    if [ -n "$qdiscs" ]; then
        return 1
    fi

    for dom in $(xenstore-list "/local/domain" 2>/dev/null || true)
    do
        # Domain 0 is never inspected.
        if [ "$dom" -eq 0 ]; then
            continue
        fi
        xenstore-exists "/libxl/$dom/remus/netbuf" || continue

        for dev in $(xenstore-list "/libxl/$dom/remus/netbuf" 2>/dev/null || true)
        do
            node="/libxl/$dom/remus/netbuf/$dev/ifb"
            xenstore-exists "$node" || continue
            owner=$(xenstore-read "$node" 2>/dev/null || true)
            if [ "$owner" = "$candidate" ]; then
                return 1
            fi
        done
    done

    return 0
}

# Pick the first free ifb device, publish the choice and bring the device up.
# Globals:
#   vifname     (read)    used in the error message only
#   XENBUS_PATH (read)    the chosen device is stored in $XENBUS_PATH/ifb
#   REMUS_IFB   (written) name of the chosen ifb device
# Calls fatal when no free ifb device can be found.
setup_ifb() {
    local candidate

    # REMUS_IFB is an input of the teardown op only. Drop any inherited
    # value so that a stale name can never pass for a freshly chosen device
    # when every ifb turns out to be busy.
    REMUS_IFB=""

    for candidate in $(ifconfig -a -s | grep -E '^ifb' | cut -d ' ' -f1)
    do
        check_ifb "$candidate" || continue
        REMUS_IFB="$candidate"
        break
    done

    if [ -z "$REMUS_IFB" ]
    then
        fatal "Unable to find a free ifb device for $vifname"
    fi

    # Record the choice in $XENBUS_PATH/ifb before bringing the device up;
    # both the teardown op and the free-device scan read that node.
    # NOTE(review): xenstore_write is defined outside this file and
    # reportedly exits on error -- confirm.
    xenstore_write "$XENBUS_PATH/ifb" "$REMUS_IFB"
    do_or_die ip link set dev "$REMUS_IFB" up
}

# Redirect traffic arriving from the guest on a vif to an ifb device.
# Arguments:
#   $1 - guest vif whose traffic is captured
#   $2 - ifb device receiving the redirected traffic
# An ingress qdisc is attached to the vif, then a u32 filter (proto ip,
# match u32 0 0) with a mirred egress redirect action is installed on it.
# If the filter cannot be installed, the ingress qdisc is removed again
# before fatal is called.
# NOTE(review): the filter is installed for "proto ip" only; confirm whether
# other protocols are meant to bypass the buffer.
redirect_vif_traffic() {
    local vif=$1
    local ifb=$2

    do_or_die tc qdisc add dev "$vif" ingress

    # The command is tested directly in the condition instead of through a
    # later "$?" check: should the sourced helpers enable errexit, a failing
    # stand-alone command would end the script before the cleanup below had
    # a chance to run.
    if ! tc filter add dev "$vif" parent ffff: proto ip prio 10 \
        u32 match u32 0 0 action mirred egress redirect dev "$ifb" >/dev/null 2>&1
    then
        do_without_error tc qdisc del dev "$vif" ingress
        fatal "Failed to redirect traffic from $vif to $ifb"
    fi
}

# Install the plug qdisc on an ifb device.
# Arguments:
#   $1 - guest vif; its ingress qdisc is removed if the plug qdisc cannot
#        be added
#   $2 - ifb device that gets the plug qdisc as its root qdisc
# Calls fatal when the plug qdisc cannot be added. Failing to set the
# buffer limit afterwards is tolerated.
add_plug_qdisc() {
    local vif=$1
    local ifb=$2

    # Test the command directly rather than inspecting "$?" afterwards:
    # should the sourced helpers enable errexit, a failing stand-alone
    # command would end the script before the cleanup below could run.
    if ! nl-qdisc-add --dev="$ifb" --parent root plug >/dev/null 2>&1
    then
        do_without_error tc qdisc del dev "$vif" ingress
        fatal "Failed to add plug qdisc to $ifb"
    fi

    # Set the ifb buffering limit in bytes. It is okay if this command fails.
    nl-qdisc-add --dev="$ifb" --parent root \
        --update plug --limit=10000000 >/dev/null 2>&1 || true
}

# Remove the network buffer from a vif and free its ifb device.
# Arguments:
#   $1 - guest vif whose ingress qdisc is removed
#   $2 - ifb device to be cleaned up (may be empty)
# Globals:
#   XENBUS_PATH (read) - the ifb, hotplug-status and hotplug-error nodes
#                        below it are removed
# Every step is best effort, so the function can be invoked repeatedly.
teardown_netbuf() {
    local vif=$1
    local ifb=$2
    # Declared unconditionally so that, when the xenstore node is missing,
    # the comparison below cannot pick up a same-named variable inherited
    # from the environment or set globally.
    local recorded=""

    # Only touch the ifb when XENBUS_PATH/ifb exists and still names it.
    # Otherwise, if the teardown op is called multiple times, we may end up
    # freeing an IFB that has meanwhile been allocated to another domain.
    if xenstore-exists "$XENBUS_PATH/ifb"
    then
        recorded=$(xenstore-read "$XENBUS_PATH/ifb" 2>/dev/null || true)
    fi

    if [[ -n "$recorded" && "$recorded" == "$ifb" ]]; then
        do_without_error ip link set dev "$ifb" down
        do_without_error nl-qdisc-delete --dev="$ifb" --parent root plug >/dev/null 2>&1
        xenstore-rm -t "$XENBUS_PATH/ifb" 2>/dev/null || true
    fi
    do_without_error tc qdisc del dev "$vif" ingress
    xenstore-rm -t "$XENBUS_PATH/hotplug-status" 2>/dev/null || true
    xenstore-rm -t "$XENBUS_PATH/hotplug-error" 2>/dev/null || true
}

# Run the requested operation.
if [ "$command" = "setup" ]; then
    check_libnl_tools
    check_modules

    # Choosing an ifb and installing the qdiscs happens under the "pickifb"
    # lock.
    # NOTE(review): the three steps below report failures through fatal;
    # whether the lock is released in that case depends on claim_lock,
    # which is defined outside this file -- confirm.
    claim_lock "pickifb"
    setup_ifb
    redirect_vif_traffic "$vifname" "$REMUS_IFB"
    add_plug_qdisc "$vifname" "$REMUS_IFB"
    release_lock "pickifb"

    success
elif [ "$command" = "teardown" ]; then
    teardown_netbuf "$vifname" "$REMUS_IFB"
fi

log debug "Successful remus-netbuf-setup $command for $vifname, ifb $REMUS_IFB."
